Load the tweets and check that they are loaded correctly.

# Set working directory (kept commented out: scripts should use relative
# paths rather than calling setwd())
# getwd()
# setwd("./data/")

# Load data: this creates the `tweets` tibble in the global environment
load("../data/Tweets_all.rda")

# Check that tweets are loaded: first rows of the tibble
head(tweets)
## # A tibble: 6 × 14
##   created_at               id id_str            full_text in_reply_to_screen_n…¹
##   <dttm>                <dbl> <chr>             <chr>     <chr>                 
## 1 2023-01-20 17:17:32 1.62e18 1616469988369469… "Im MSc … <NA>                  
## 2 2023-01-13 07:52:01 1.61e18 1613790954737074… "Was bew… <NA>                  
## 3 2023-01-12 19:30:01 1.61e18 1613604227141537… "Was uns… <NA>                  
## 4 2023-01-12 08:23:00 1.61e18 1613436367169634… "Eine di… <NA>                  
## 5 2023-01-11 14:00:05 1.61e18 1613158809081450… "Wir gra… <NA>                  
## 6 2023-01-10 17:06:11 1.61e18 1612843252083834… "Unsere … <NA>                  
## # ℹ abbreviated name: ¹​in_reply_to_screen_name
## # ℹ 9 more variables: retweet_count <int>, favorite_count <int>, lang <chr>,
## #   university <chr>, tweet_date <dttm>, tweet_minute <dttm>,
## #   tweet_hour <dttm>, tweet_month <date>, timeofday_hour <chr>
# Overview of column types and value ranges (19,575 tweets, 2009-2023)
summary(tweets)
##    created_at                          id               id_str         
##  Min.   :2009-09-29 14:29:47.0   Min.   :4.469e+09   Length:19575      
##  1st Qu.:2015-01-28 15:07:41.5   1st Qu.:5.604e+17   Class :character  
##  Median :2018-04-13 13:26:56.0   Median :9.848e+17   Mode  :character  
##  Mean   :2017-12-09 15:26:50.7   Mean   :9.400e+17                     
##  3rd Qu.:2020-10-20 10:34:50.0   3rd Qu.:1.318e+18                     
##  Max.   :2023-01-26 14:49:31.0   Max.   :1.619e+18                     
##   full_text         in_reply_to_screen_name retweet_count     favorite_count  
##  Length:19575       Length:19575            Min.   :  0.000   Min.   :  0.00  
##  Class :character   Class :character        1st Qu.:  0.000   1st Qu.:  0.00  
##  Mode  :character   Mode  :character        Median :  1.000   Median :  0.00  
##                                             Mean   :  1.289   Mean   :  1.37  
##                                             3rd Qu.:  2.000   3rd Qu.:  2.00  
##                                             Max.   :267.000   Max.   :188.00  
##      lang            university          tweet_date                    
##  Length:19575       Length:19575       Min.   :2009-09-29 00:00:00.00  
##  Class :character   Class :character   1st Qu.:2015-01-28 00:00:00.00  
##  Mode  :character   Mode  :character   Median :2018-04-13 00:00:00.00  
##                                        Mean   :2017-12-09 02:25:45.00  
##                                        3rd Qu.:2020-10-20 00:00:00.00  
##                                        Max.   :2023-01-26 00:00:00.00  
##   tweet_minute                      tweet_hour                    
##  Min.   :2009-09-29 14:29:00.00   Min.   :2009-09-29 14:00:00.00  
##  1st Qu.:2015-01-28 15:07:00.00   1st Qu.:2015-01-28 14:30:00.00  
##  Median :2018-04-13 13:26:00.00   Median :2018-04-13 13:00:00.00  
##  Mean   :2017-12-09 15:26:24.68   Mean   :2017-12-09 14:59:43.81  
##  3rd Qu.:2020-10-20 10:34:30.00   3rd Qu.:2020-10-20 10:00:00.00  
##  Max.   :2023-01-26 14:49:00.00   Max.   :2023-01-26 14:00:00.00  
##   tweet_month         timeofday_hour    
##  Min.   :2009-09-01   Length:19575      
##  1st Qu.:2015-01-01   Class :character  
##  Median :2018-04-01   Mode  :character  
##  Mean   :2017-11-24                     
##  3rd Qu.:2020-10-01                     
##  Max.   :2023-01-01

Start preprocessing the tweets; to calculate the intervals between tweets, some additional properties are needed. We also extract the emojis from the tweets and write them out as text, so that they can be analyzed better in the word clouds. Finally, we detect the language of each tweet so that separate word clouds can be made per language.

# Preprocessing Step: Convert date and time to POSIXct and format according to date, year and university
tweets <- tweets %>%
  mutate(
    # NOTE(review): created_at already prints as <dttm> in the summary above,
    # so this conversion is likely a no-op kept for safety — confirm.
    created_at = as.POSIXct(created_at, format = "%Y-%m-%d %H:%M:%S"),
    # Calendar date of the tweet (drops the time-of-day part)
    date = as.Date(created_at),
    # Day-of-week name. NOTE(review): weekdays() is locale-dependent and the
    # factor levels below are English; in a non-English locale every value
    # would become NA — confirm the session locale is English.
    day = weekdays(created_at),
    day = factor(day, levels = c(
      "Monday", "Tuesday",
      "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"
    )),
    year = year(created_at),
    university = as.character(university),
    # Detected language of the tweet text (used later for sentiment filtering)
    language = detect_language(full_text),
    # Replace emoji characters with their textual names so they survive
    # tokenisation and appear in the word clouds
    full_text_emojis = replace_emoji(full_text, emoji_dt = lexicon::hash_emojis)
  )

# Helper: strip the two-character placeholder tags (e.g. "<e2>") that
# replace_emoji() leaves behind in the converted text. Vectorised over `text`.
remove_emoji_tags <- function(text) {
  gsub("<[a-z0-9]{2}>", "", text)
}

# remove_emoji_tags()/str_remove_all() is vectorised, so it can be applied to
# the whole column directly. The previous sapply() call was a redundant
# element-wise loop that also attached the original texts as a names attribute.
tweets$full_text_emojis <- remove_emoji_tags(tweets$full_text_emojis)

# Store the extracted emojis in a separate column to analyze later.
# pull() returns the column itself; the previous select() produced a
# one-column tibble, which ended up nested as a data-frame column.
tweets$emoji_unicode <- tweets %>%
  emoji_extract_nest(full_text) %>%
  pull(.emoji_unicode)

Question 1: How many tweets are being posted by the various Universities when? Are there any ‘release’ strategies visible?

# Count each tweet by university and hour of the day.
# The result keeps its (university, timeofday_hour) grouping; each row is one
# unique combination with its count n.
tweet_counts_by_hour_of_day <- tweets %>%
  group_by(university, timeofday_hour) %>%
  count() %>%
  arrange(university, timeofday_hour)

# Plot the number of tweets by university and hour of the day
ggplot(
  tweet_counts_by_hour_of_day,
  aes(
    x = timeofday_hour, y = n,
    color = university, group = university
  )
) +
  geom_line() +
  facet_wrap(~university) +
  labs(
    title = "Number of tweets by university and hour",
    x = "Hour of day",
    y = "Number of tweets"
  )

# Show the most active hour for each university.
# Note: tweet_counts_by_hour_of_day already contains exactly one row per
# (university, timeofday_hour) pair, so the previous re-grouping plus
# summarize(sum(n)) was an identity re-aggregation; renaming the count is
# enough before picking each university's maximum (ties are kept, as before).
hours_with_most_tweets_by_uni <- tweet_counts_by_hour_of_day %>%
  rename(total_tweets = n) %>%
  group_by(university) %>%
  slice_max(n = 1, order_by = total_tweets)

print(hours_with_most_tweets_by_uni)
## # A tibble: 8 × 3
## # Groups:   university [8]
##   university     timeofday_hour total_tweets
##   <chr>          <chr>                 <int>
## 1 FHNW           09                      344
## 2 FH_Graubuenden 11                      493
## 3 ZHAW           17                      580
## 4 bfh            08                      497
## 5 hes_so         10                      315
## 6 hslu           09                      380
## 7 ost_fh         08                       44
## 8 supsi_ch       11                      330
# Show the single most active hour across all universities.
# slice_max(with_ties = FALSE) is the idiomatic replacement for
# arrange(desc(...)) + slice(1) and returns the same single row.
hour_with_most_tweets <- tweet_counts_by_hour_of_day %>%
  group_by(timeofday_hour) %>%
  summarize(total_tweets = sum(n)) %>%
  slice_max(n = 1, order_by = total_tweets, with_ties = FALSE)

print(hour_with_most_tweets)
## # A tibble: 1 × 2
##   timeofday_hour total_tweets
##   <chr>                 <int>
## 1 11                     2356
# Count each tweet by university and weekday.
# `day` is a factor ordered Monday-Sunday (set in preprocessing), which keeps
# the weekday axis ordered in the plot below.
tweet_counts_by_week_day <- tweets %>%
  group_by(university, day) %>%
  count() %>%
  arrange(university, day)

# Plot the number of tweets by university and day of the week
ggplot(tweet_counts_by_week_day, aes(
  x = day,
  y = n, color = university,
  group = university
)) +
  geom_line() +
  facet_wrap(~university) +
  labs(
    title = "Number of tweets by university and day of the week",
    x = "Day of the week", y = "Number of tweets"
  )

# Show the most active day for each university.
# Note: tweet_counts_by_week_day already contains exactly one row per
# (university, day) pair, so the previous re-grouping plus summarize(sum(n))
# was an identity re-aggregation; renaming the count is enough before picking
# each university's maximum (ties are kept, as before).
days_with_most_tweets_by_uni <- tweet_counts_by_week_day %>%
  rename(total_tweets = n) %>%
  group_by(university) %>%
  slice_max(n = 1, order_by = total_tweets)

print(days_with_most_tweets_by_uni)
## # A tibble: 8 × 3
## # Groups:   university [8]
##   university     day       total_tweets
##   <chr>          <fct>            <int>
## 1 FHNW           Tuesday            575
## 2 FH_Graubuenden Tuesday            751
## 3 ZHAW           Wednesday          636
## 4 bfh            Tuesday            651
## 5 hes_so         Tuesday            415
## 6 hslu           Thursday           603
## 7 ost_fh         Friday              65
## 8 supsi_ch       Friday             461
# Calculate time intervals between tweets
# Return the most frequently occurring value in `x` (the statistical mode).
# Ties are broken in favour of the value that appears first in `x`.
find_mode <- function(x) {
  distinct_vals <- unique(x)
  counts <- tabulate(match(x, distinct_vals))
  distinct_vals[which.max(counts)]
}

tweets <- tweets %>%
  arrange(university, created_at) %>%
  group_by(university) %>%
  # Minutes since the previous tweet of the same university (NA for each
  # university's first tweet — hence the 8 NAs in the summary below)
  mutate(time_interval = as.numeric(
    difftime(created_at, lag(created_at), units = "mins")
  ))
# NOTE(review): `tweets` remains grouped by university after this step (no
# ungroup()). Later chunks — e.g. count(lang) in the language analysis, whose
# printed output is grouped by university — rely on this lingering grouping,
# so do not add ungroup() here without updating them.

# Descriptive statistics of time intervals (in minutes)
summary(tweets$time_interval)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##      0.0    148.2   1128.8   2097.6   2428.3 220707.0        8
# setwd("../4.Text-Mining-Groupwork/plots")
# Distinct years present in the data (used by the commented-out plot loop)
unique_years <- tweets$year %>% unique()
# Plot distribution of time intervals between tweets for each year
# for (curr_year in unique_years) {
#   # Filter data for the specific year
#   filtered_data <- tweets %>%
#     filter(year(created_at) == curr_year)

#   print(ggplot(filtered_data, aes(x = time_interval)) +
#     geom_histogram(fill = "lightblue") +
#     facet_wrap(~university) +
#     labs(
#       title = paste0(
#         "Distribution of time intervals between tweets - ", curr_year
#       ),
#       x = "Time interval (minutes)",
#       y = "Tweet count"
#     ))
#   universities <- filtered_data$university %>% unique()
#   for (uni in universities) {
#     # Filter data for the specific university
#     uni_filtered_data <- filtered_data %>%
#       filter(university == uni)

#     print(ggplot(uni_filtered_data, aes(x = time_interval)) +
#       geom_histogram(fill = "lightblue") +
#       labs(
#         title = paste0(
#           "Distribution of time intervals between tweets for ", uni,
#           " in ", curr_year
#         ),
#         x = "Time interval (minutes)",
#         y = "Tweet count"
#       ))
#     # Calculate mode (most common interval) in hours
#     most_common_interval_minutes <- find_mode(uni_filtered_data$time_interval)
#     most_common_interval_hours <- most_common_interval_minutes / 60
#     print(paste0(
#       "Most common time interval for ", uni,
#       " in ",
#       curr_year,
#       " is ", most_common_interval_minutes,
#       " minutes (", most_common_interval_hours, " hours)"
#     ))
#   }
# }

Question 2: What are the tweets about and how do other Twitter users react to them (likes, etc.)?

Data Preprocessing

# Build a quanteda corpus from the emoji-converted tweet text
tweets_corpus <- corpus(tweets, text_field = "full_text_emojis")

# Tokenise, dropping URLs, punctuation, symbols, numbers and separators
tokens <- tokens(tweets_corpus,
  remove_url = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_separators = TRUE
)

# Remove 'amp' as it is not meaningful: it is the HTML-escaped "&" symbol.
# Remove 'rt' (the retweet marker); note that stemming also produces
# 'rt'-like fragments from words such as 'engagiert'.
# TODO: Check if languages are correct
extended_stopwords <- c(
  stopwords("en"),
  stopwords("fr"),
  stopwords("de"),
  stopwords("it"),
  # The universities' own hashtags/handles carry no content for word clouds
  "#fhnw", "#bfh", "@htw_chur", "#hslu", "#supsi",
  # "fr" and "ber" are presumably umlaut-stripped stems of "für"/"über" —
  # TODO confirm against the token lists
  "amp", "rt", "fr", "ber"
)
# transform to lowercase
tokens <- tokens_tolower(tokens)
# Stem all words
tokens <- tokens_wordstem(tokens)
# Unigrams only (n = 1): effectively a no-op kept for symmetry with the
# bigram/trigram steps further below. (The earlier comment claiming
# "n-grams of any length" was inaccurate.)
tokens <- tokens_ngrams(tokens, n = 1)
# remove stopwords in multiple languages and remove university hashtags
tokens <- quanteda::tokens_select(tokens,
  pattern = extended_stopwords,
  selection = "remove"
)
# Create Document-feature-matrix (documents x terms, counts)
doc_matrix <- dfm(tokens)

Content Analysis

# Term frequencies across the whole corpus, most frequent first
word_freqs <- sort(colSums(doc_matrix), decreasing = TRUE)

# Top 20 words
head(word_freqs, 20)
##       mehr       neue      right       heut      statt        bfh      studi 
##       1099        811        709        701        614        608        581 
##      neuen      thema     hes-so     findet      knnen  hochschul  schweizer 
##        536        533        533        527        518        507        504 
##    schweiz    projekt      arrow      zeigt studierend       gibt 
##        489        463        441        433        429        428
# Frequency table backing the overall word cloud
word_freqs_df <- data.frame(
  word = featnames(doc_matrix),
  freq = colSums(doc_matrix)
)

# Draw the overall word cloud (fixed seed so the layout is reproducible)
set.seed(123)
wordcloud(
  words = word_freqs_df$word,
  freq = word_freqs_df$freq,
  max.words = 100,
  min.freq = 5,
  rot.per = 0.35,
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# TODO: Wordcloud per University

User Reaction Analysis

# TODO: Add retweet and other stuff
# Identify the 1000 tweets with the most likes.
# NOTE(review): `tweets` may still be grouped by university at this point;
# arrange() and base head() both ignore grouping, so this is the global
# top 1000 — confirm that is intended.
most_liked_tweets <- tweets %>%
  arrange(desc(favorite_count)) %>%
  head(1000)

# Analyze the posting time of the most liked tweets (hour of day, "00"-"23")
most_liked_tweets_time <- most_liked_tweets %>%
  mutate(time_of_day = format(created_at, "%H"))

# Histogram of posting hours for the most liked tweets
ggplot(most_liked_tweets_time, aes(x = as.numeric(time_of_day))) +
  geom_histogram(fill = "lightblue", color = "blue", binwidth = 1) +
  labs(
    x = "Hour of Day",
    y = "Frequency",
    title = "Distribution of Posting Times for Most Liked Tweets"
  )

Analyse the content of the most liked tweets

# Preprocessing content of most liked tweets.
# NOTE(review): this uses the raw `full_text` (not `full_text_emojis`) and
# builds 1- to 3-grams, unlike the main pipeline above which used the
# emoji-converted text and unigrams — confirm this difference is intentional.
most_liked_tokens <- tokens(most_liked_tweets$full_text,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)

# Apply the same stopword removal and transformations

most_liked_tokens <- tokens_tolower(most_liked_tokens)
most_liked_tokens <- tokens_wordstem(most_liked_tokens)
most_liked_tokens <- tokens_ngrams(most_liked_tokens, n = 1:3)
most_liked_tokens <- tokens_select(most_liked_tokens,
  pattern = extended_stopwords, selection = "remove"
)
most_liked_doc_matrix <- dfm(most_liked_tokens)

# Term frequencies of the most liked tweets, most frequent first
most_liked_word_freqs <- sort(colSums(most_liked_doc_matrix), decreasing = TRUE)

# Frequency table backing the word cloud
most_liked_word_freqs_df <- data.frame(
  word = featnames(most_liked_doc_matrix),
  freq = colSums(most_liked_doc_matrix)
)

# Word cloud of the most liked tweets (fixed seed for reproducible layout)
set.seed(123)
wordcloud(
  words = most_liked_word_freqs_df$word,
  freq = most_liked_word_freqs_df$freq,
  max.words = 100,
  min.freq = 2,
  rot.per = 0.35,
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# TODO: Per university

Question 3: How do the university tweets differ in terms of content, style, emotions, etc?

Content Analysis (Word Clouds)

# for (uni in unique(tweets$university)) {
# Filter tweets by university (loop disabled: currently only "bfh" is run)
uni_tweets <- tweets %>%
  filter(university == "bfh")

# Tokenise this university's raw tweet text (same cleaning flags as the
# corpus-wide pipeline above)
uni_tokens <- tokens(uni_tweets$full_text,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
# transform to lowercase
uni_tokens <- tokens_tolower(uni_tokens)
# Stem all words
uni_tokens <- tokens_wordstem(uni_tokens)
# Unigrams only (n = 1); the earlier "any length" comment was inaccurate
uni_tokens <- tokens_ngrams(uni_tokens, n = 1)

uni_tokens <- tokens_select(uni_tokens,
  pattern = extended_stopwords, selection = "remove"
)
uni_dfm <- dfm(uni_tokens)

# Frequency table backing this university's word cloud
uni_word_freqs_df <- data.frame(
  word = featnames(uni_dfm),
  freq = colSums(uni_dfm)
)

# Word cloud for the selected university (fixed seed for layout stability)
set.seed(123)
wordcloud(
  words = uni_word_freqs_df$word,
  freq = uni_word_freqs_df$freq,
  max.words = 100,
  min.freq = 5,
  rot.per = 0.35,
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# Interactive HTML word cloud of the same frequency table
library(wordcloud2)
wordcloud2(uni_word_freqs_df, size = 0.5)
# TODO: Save wordcloud
# TODO: Save wordcloud

# Count the most used emojis for the selected university
emoji_count_per_university <- uni_tweets %>%
  top_n_emojis(full_text)

# Horizontal bar chart, most frequent emoji on top
emoji_count_per_university %>%
  mutate(emoji_name = reorder(emoji_name, n)) %>%
  ggplot(aes(n, emoji_name)) +
  geom_col() +
  labs(title = "Top 20 Emojis Used", x = "Count", y = NULL)

# }

# Bigram analysis: build 2-grams from the preprocessed tokens
bi_gram_tokens <- tokens_ngrams(tokens, n = 2)
bi_gram_matrix <- dfm(bi_gram_tokens)

# Frequency table backing the bigram cloud
bi_gram_freqs_df <- data.frame(
  word = featnames(bi_gram_matrix),
  freq = colSums(bi_gram_matrix)
)

# Bigram word cloud (fixed seed for reproducibility)
set.seed(123)
wordcloud(
  words = bi_gram_freqs_df$word,
  freq = bi_gram_freqs_df$freq,
  max.words = 100,
  min.freq = 3,
  rot.per = 0.35,
  random.order = FALSE,
  colors = brewer.pal(8, "Accent")
)

# Trigram analysis: build 3-grams from the preprocessed tokens
tri_gram_tokens <- tokens_ngrams(tokens, n = 3)
tri_gram_matrix <- dfm(tri_gram_tokens)

# Frequency table backing the trigram cloud
tri_gram_freqs_df <- data.frame(
  word = featnames(tri_gram_matrix),
  freq = colSums(tri_gram_matrix)
)

# Trigram word cloud, all words horizontal (fixed seed for reproducibility)
set.seed(123)
wordcloud(
  words = tri_gram_freqs_df$word,
  freq = tri_gram_freqs_df$freq,
  max.words = 100,
  min.freq = 2,
  rot.per = 0,
  random.order = FALSE,
  colors = brewer.pal(8, "Paired")
)

### LDA Topic Modeling

new.dfm <- dfm_subset(doc_matrix, ntoken(doc_matrix) > 0) # drop all rows with only zeros (LDA cannot handle empty documents)
# Fit a 5-topic LDA model; the seed fixes the random initialisation.
# NOTE(review): k = 5 appears to be an arbitrary choice — confirm (e.g. via
# perplexity/coherence) whether it is appropriate.
tweet_lda <- LDA(new.dfm, k = 5, control = list(seed = 123))
# Tidy the LDA results (one row per topic-term pair with beta weight)
topic_terms <- tidy(tweet_lda, matrix = "beta")
# Extract topics and top terms
topics <- as.data.frame(terms(tweet_lda, 50)) # First fifty words per topic

# Tidy the LDA results into one row per (topic, term) with its beta weight
tweet_lda_td <- tidy(tweet_lda)

# Extract the top 8 terms per topic.
# slice_max() is the modern replacement for the superseded top_n(); its
# default with_ties = TRUE keeps ties exactly as top_n() did.
top_terms <- tweet_lda_td %>%
  group_by(topic) %>%
  slice_max(beta, n = 8) %>% # Show top 8 terms per topic
  ungroup() %>%
  arrange(topic, -beta)

# Bar chart of the top terms, one facet per topic, bars ordered within facet
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_y_reordered() +
  facet_wrap(~topic, scales = "free") +
  labs(
    title = "Top Terms per Topic in Tweets (LDA)",
    x = "Beta (Term Importance within Topic)",
    y = NULL
  )

# Most different words among topics (using log ratios of topic betas).
# pivot_wider() replaces the superseded spread(); note it preserves
# first-appearance row order rather than sorting by the key, so the printed
# row order may differ from the old output while the content is the same.
diff <- tweet_lda_td %>%
  mutate(topic = paste0("topic", topic)) %>%
  pivot_wider(names_from = topic, values_from = beta) %>%
  filter(topic1 > .001 | topic2 > .001 | topic3 > .001) %>%
  mutate(
    logratio_t1t2 = log2(topic2 / topic1),
    logratio_t1t3 = log2(topic3 / topic1),
    logratio_t2t3 = log2(topic3 / topic2)
  )
diff
## # A tibble: 171 × 9
##    term       topic1  topic2  topic3  topic4  topic5 logratio_t1t2 logratio_t1t3
##    <chr>       <dbl>   <dbl>   <dbl>   <dbl>   <dbl>         <dbl>         <dbl>
##  1 @bfh_hesb 1.66e-3 9.62e-4 1.86e-3 3.45e-4 3.41e-4        -0.790         0.157
##  2 @enginee… 5.06e-4 1.57e-3 1.56e-3 1.10e-3 4.09e-4         1.63          1.63 
##  3 @fhnw     2.70e-4 1.53e-3 3.34e-4 1.61e-3 1.05e-3         2.50          0.305
##  4 @fhnwbusi 1.63e-4 2.15e-3 2.31e-3 3.13e-4 2.98e-3         3.72          3.83 
##  5 @fhnwtec… 1.87e-3 4.11e-5 7.78e-4 1.02e-3 6.08e-4        -5.51         -1.27 
##  6 @hes_so   1.06e-3 6.60e-4 9.32e-4 6.51e-4 4.86e-4        -0.678        -0.179
##  7 @hsafhnw  1.27e-3 8.45e-4 4.57e-4 1.14e-3 1.73e-3        -0.584        -1.47 
##  8 @hslu     1.44e-3 4.08e-3 2.08e-3 6.85e-4 2.08e-4         1.50          0.525
##  9 @supsi_ch 8.45e-4 1.36e-3 5.18e-4 1.46e-4 2.96e-5         0.681        -0.705
## 10 @zhaw     6.62e-4 2.68e-3 3.90e-4 1.47e-3 8.58e-4         2.02         -0.763
## # ℹ 161 more rows
## # ℹ 1 more variable: logratio_t2t3 <dbl>
# Add topic probabilities to original data
# lda_gamma <- tidy(tweet_lda, matrix = "gamma")
# tweets <- tweets %>%
#  mutate(document_id = row_number()) %>% # Add a unique ID for each tweet
#  left_join(lda_gamma, by = c("document_id" = "document"))

# Analyze topics by university
# tweets %>%
#  count(university, topic, wt = gamma) %>%
#  group_by(university) %>%
#  slice_max(n = 3, order_by = n, with_ties = FALSE) %>%
#  ungroup() %>%
#  mutate(topic = paste0("Topic ", topic)) %>%
#  ggplot(aes(x = university, y = n, fill = topic)) +
#  geom_col(position = "dodge") +
#  labs(title = "Top 3 Topics by University", y = "Topic Proportion")

Style Analysis

# Style: histogram of tweet lengths in characters.
# NOTE(review): nchar() counts characters/code points; multi-byte emojis may
# count differently than expected — acceptable for a coarse histogram.
tweets %>%
  mutate(tweet_length = nchar(full_text)) %>%
  ggplot(aes(x = tweet_length)) +
  geom_histogram() +
  labs(title = "Distribution of Tweet Lengths")

### Sentiment Analysis

# Calculate Sentiment for Supported Languages Only
langs <- c("de", "fr", "it", "en")

# Keep tweets whose *detected* language is supported. Note this uses the
# `language` column created in preprocessing, not Twitter's own `lang` field.
tweets_filtered <- tweets %>%
  filter(language %in% langs)

# Compute a syuzhet sentiment score for `text` in language `lang`.
# Returns NA_real_ for unsupported languages so that the result is always
# numeric; the previous plain NA was logical, which can surprise
# type-sensitive downstream code. `langs` is the supported-language vector
# defined above.
get_syuzhet_sentiment <- function(text, lang) {
  if (lang %in% langs) {
    get_sentiment(text, method = "syuzhet", lang = lang)
  } else {
    NA_real_ # unsupported language
  }
}

# Calculate Syuzhet Sentiment for each Tweet.
# Bug fix: the rows were filtered on the detected `language` column above, but
# the sentiment call previously passed Twitter's `lang` field, so tweets whose
# detected language was supported could still silently score NA. Pass
# `language` so the filter and the sentiment lookup agree.
tweets_filtered$sentiment <-
  mapply(
    get_syuzhet_sentiment,
    tweets_filtered$full_text,
    tweets_filtered$language
  )

# Sentiment over Time: bucket each tweet into its calendar month
tweets_filtered$month <- floor_date(tweets_filtered$created_at, "month")

# Monthly mean sentiment per university (NA scores are dropped)
plot_data <- tweets_filtered %>%
  group_by(university, month) %>%
  summarize(mean_sentiment_syuzhet = mean(sentiment, na.rm = TRUE))

# One mean-sentiment line per university over time
ggplot(plot_data, aes(
  x = month,
  y = mean_sentiment_syuzhet,
  group = university, color = university
)) +
  geom_line() +
  scale_x_datetime(date_breaks = "1 month", date_labels = "%Y-%m") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Mean Syuzhet Sentiment Over Time by University",
    y = "Mean Sentiment Score"
  )

# TODO: Check sentiment and use syuzhet
# for (uni in unique(tweets$university)) {
uni <- "bfh"
# NOTE(review): this filter uses Twitter's `lang` field, while the corpus-wide
# filter above and the re-filter further below use the detected `language`
# column — confirm which is intended and make them consistent.
uni_tweets <- tweets %>%
  filter(university == uni, lang %in% langs)

uni_tweets$sentiment <-
  mapply(get_syuzhet_sentiment, uni_tweets$full_text, uni_tweets$lang)

# Sentiment Over Time (Per University): aggregate to monthly means
uni_tweets$month <- floor_date(uni_tweets$created_at, "month")
uni_tweets$year <- year(uni_tweets$month)

# Monthly mean sentiment for this university (NA scores are dropped)
plot_data <- uni_tweets %>%
  group_by(year, month) %>%
  summarize(mean_sentiment = mean(sentiment, na.rm = TRUE))

# Monthly sentiment for the selected university, one facet per year
print(ggplot(plot_data, aes(x = month, y = mean_sentiment)) +
  geom_line(aes(color = as.factor(year))) +
  scale_x_datetime(date_breaks = "1 month", date_labels = "%Y-%m") +
  facet_wrap(~year, scales = "free_x") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = paste0("Mean Syuzhet Sentiment Over Time by - ", uni),
    y = "Mean Sentiment Score"
  ))

# Re-filter this university's tweets using the detected `language` column
# (consistent with the corpus-wide sentiment filter above)
uni_tweets <- tweets %>%
  filter(university == uni, language %in% langs)

# Tokenize and Preprocess Words.
# Bug fix: this step previously tokenized `tweets` (the full dataset), so the
# per-university positive/negative word clouds below were actually built from
# every university's tweets. Use the just-filtered `uni_tweets` instead.
uni_words <- uni_tweets %>%
  unnest_tokens(word, full_text) %>%
  anti_join(stop_words) # remove Stopwords

# Attach bing sentiment labels to the tokenised words
sentiment_words <- uni_words %>%
  inner_join(get_sentiments("bing"), by = "word")

# Count words carrying the given bing sentiment label, most frequent first
count_sentiment_words <- function(words, label) {
  words %>%
    filter(sentiment == label) %>%
    count(word, sort = TRUE)
}

# Separate positive and negative word counts
positive_words <- count_sentiment_words(sentiment_words, "positive")
negative_words <- count_sentiment_words(sentiment_words, "negative")

# Side-by-side positive / negative word clouds
par(mfrow = c(1, 2))

wordcloud(
  words = positive_words$word,
  freq = positive_words$n,
  rot.per = 0.35,
  random.order = FALSE,
  colors = brewer.pal(8, "Greens")
)
title(main = paste("Positive Words for", uni), line = 2)

wordcloud(
  words = negative_words$word,
  freq = negative_words$n,
  rot.per = 0.35,
  random.order = FALSE,
  colors = brewer.pal(8, "Reds")
)
title(main = paste("Negative Words for", uni), line = 2)

# }

Question 4: What specific advice can you give us as communication department of BFH based on your analysis? How can we integrate the analysis of tweets in our internal processes, can you think of any data products that would be of value for us?

Summary key insights from the analysis

# Language Analysis.
# NOTE(review): the tibble printed below is grouped by university — `tweets`
# is still grouped from the time-interval step, so count(lang) tallies per
# (university, lang), not per lang overall. If an ungroup() is ever added
# upstream, this output changes shape.
tweets %>%
  count(lang) %>%
  arrange(desc(n))
## # A tibble: 99 × 3
## # Groups:   university [8]
##    university     lang      n
##    <chr>          <chr> <int>
##  1 bfh            de     3008
##  2 hslu           de     2988
##  3 ZHAW           de     2941
##  4 FH_Graubuenden de     2677
##  5 FHNW           de     2570
##  6 supsi_ch       it     1849
##  7 hes_so         fr     1716
##  8 supsi_ch       en      260
##  9 ost_fh         de      259
## 10 FH_Graubuenden en      228
## # ℹ 89 more rows
# Emoji Analysis: most used emojis across the dataset
emoji_count <- tweets %>%
  top_n_emojis(full_text)

# Horizontal bar chart, most frequent emoji on top
emoji_count %>%
  mutate(emoji_name = reorder(emoji_name, n)) %>%
  ggplot(aes(n, emoji_name)) +
  geom_col() +
  labs(title = "Top 20 Emojis Used", x = "Count", y = NULL)

# insights <- list(
#  "Most Active Hours" = hours_with_most_tweets_by_uni,
#  "Most Active Days" = days_with_most_tweets_by_uni,
#  "Most Common Time Intervals" = most_common_interval_minutes,
#  "Content Analysis" = head(word_freqs),
#  "Sentiment Analysis" = tweet_sentiment
# )

Recommendations:

1. Optimize tweet release times based on peak engagement hours.

2. Focus on specific days with high activity for important announcements.

3. Utilize sentiment analysis to tailor content that resonates positively with the audience.

4. Implement topic modeling to identify key themes and align communication strategy accordingly.